Visualization of Genomic Data

Packages for Genomic Data Visualization

  • GGviz
  • trackViewer
  • ggbio: Extends ggplot2 for genomic data visualization

Install some necessary packages

BiocManager::install("BSgenome.Hsapiens.UCSC.hg19")
BiocManager::install("TxDb.Hsapiens.UCSC.hg19.knownGene")
library(Gviz)
library(trackViewer)
library(GenomicRanges)
library(rtracklayer)

Example Data: CpG Islands

data(cpgIslands)
class(cpgIslands)
[1] "GRanges"
attr(,"package")
[1] "GenomicRanges"
cpgIslands
GRanges object with 10 ranges and 0 metadata columns:
       seqnames            ranges strand
          <Rle>         <IRanges>  <Rle>
   [1]     chr7 26549019-26550183      *
   [2]     chr7 26564119-26564500      *
   [3]     chr7 26585667-26586158      *
   [4]     chr7 26591772-26593309      *
   [5]     chr7 26594192-26594570      *
   [6]     chr7 26623835-26624150      *
   [7]     chr7 26659284-26660352      *
   [8]     chr7 26721294-26721717      *
   [9]     chr7 26821518-26823297      *
  [10]     chr7 26991322-26991841      *
  -------
  seqinfo: 1 sequence from hg19 genome; no seqlengths

Plotting with Gviz

  • Function AnnotationTrack() creates a track object for plotting
  • plotTracks() function is he main interface when plotting single track objects, or lists of tracks linked together across the same genomic coordinates. The resulting plots are very similar to the graphical output of the UCSC Genome Browser
gen <- genome(cpgIslands)
print(gen)
  chr7 
"hg19" 
chr <- seqlevels(cpgIslands)
print(chr)
[1] "chr7"
atrack <- AnnotationTrack(cpgIslands, name = "CpG")
plotTracks(atrack)

Adding coordinates

gtrack <- GenomeAxisTrack()
plotTracks(list(gtrack, atrack))

Combining multiple tracks

gr1 <- GRanges(
  seqnames = Rle(c("chr1", "chr2"), c(3, 2)),
  ranges = IRanges(start = c(1, 5, 10, 15, 20), end = c(4, 8, 12, 18, 25)),
  strand = Rle(strand(c("+", "-", "+", "-", "+")))
)
gr2 <- GRanges(
  seqnames = Rle(c("chr1", "chr2"), c(3, 2)),
  ranges = IRanges(start = c(2, 6, 11, 16, 21), end = c(5, 9, 13, 19, 26)),
  strand = Rle(strand(c("+", "-", "+", "-", "+")))
)
track1 <- AnnotationTrack(gr1, name = "Track 1")
track2 <- AnnotationTrack(gr2, name = "Track 2")
plotTracks(list(gtrack, track1, track2))

Only the chr1 region is shown in the plot - if region is not specified, the entire first chromosome is plotted.

Specifying the region for plotting

Use the from and to arguments to specify the region of interest. The chromosome argument specifies the chromosome to plot.

plotTracks(list(gtrack, track1, track2), from = 1, to = 50,
             chromosome = "chr2")

Adding ideogram

If ideogram data is available, it can be added to the plot using the IdeogramTrack() function. The chromosome argument specifies the chromosome to plot.

Ideogram tracks are the one exception in all of Gviz’s track objects in the sense that they are not really displayed on the same coordinate system like all the other tracks. Instead, the current genomic location is indicated on the chromosome by a red box/line

print(gen)
  chr7 
"hg19" 
itrack <- IdeogramTrack(genome = gen, chromosome = chr)

plotTracks(list(itrack, gtrack, atrack))

Adding gene models

#
data(geneModels)
print(geneModels)
   chromosome    start      end width strand        feature            gene
1        chr7 26591441 26591829   389      +        lincRNA ENSG00000233760
2        chr7 26591458 26591829   372      +        lincRNA ENSG00000233760
3        chr7 26591515 26591829   315      +        lincRNA ENSG00000233760
4        chr7 26594428 26594538   111      +        lincRNA ENSG00000233760
5        chr7 26594428 26596819  2392      +        lincRNA ENSG00000233760
6        chr7 26594641 26594733    93      +        lincRNA ENSG00000233760
7        chr7 26594641 26594895   255      +        lincRNA ENSG00000233760
8        chr7 26677490 26677555    66      +           utr5 ENSG00000222004
9        chr7 26678114 26678763   650      +           utr5 ENSG00000222004
10       chr7 26678764 26678963   200      + protein_coding ENSG00000222004
11       chr7 26680315 26680383    69      + protein_coding ENSG00000222004
12       chr7 26685919 26686159   241      + protein_coding ENSG00000222004
13       chr7 26686160 26686889   730      +           utr3 ENSG00000222004
14       chr7 26961729 26962531   803      +     pseudogene ENSG00000213787
15       chr7 26572740 26576371  3632      -           utr3 ENSG00000122548
16       chr7 26576372 26576639   268      - protein_coding ENSG00000122548
17       chr7 26578025 26578173   149      - protein_coding ENSG00000122548
18       chr7 26578174 26578407   234      -           utr5 ENSG00000122548
19       chr7 26706681 26709275  2595      -           utr3 ENSG00000005020
20       chr7 26709048 26709275   228      -           utr3 ENSG00000005020
21       chr7 26709710 26709718     9      -           utr3 ENSG00000005020
22       chr7 26709710 26709718     9      -           utr3 ENSG00000005020
23       chr7 26709719 26709811    93      - protein_coding ENSG00000005020
24       chr7 26709719 26709811    93      - protein_coding ENSG00000005020
25       chr7 26724355 26724467   113      - protein_coding ENSG00000005020
26       chr7 26724355 26724467   113      - protein_coding ENSG00000005020
27       chr7 26729904 26729981    78      - protein_coding ENSG00000005020
28       chr7 26729904 26729981    78      - protein_coding ENSG00000005020
29       chr7 26729926 26729981    56      - protein_coding ENSG00000005020
30       chr7 26765047 26765184   138      - protein_coding ENSG00000005020
31       chr7 26765047 26765184   138      - protein_coding ENSG00000005020
32       chr7 26765047 26765184   138      - protein_coding ENSG00000005020
33       chr7 26765542 26765605    64      - protein_coding ENSG00000005020
34       chr7 26765542 26765605    64      - protein_coding ENSG00000005020
35       chr7 26765542 26765605    64      - protein_coding ENSG00000005020
36       chr7 26766301 26766625   325      - protein_coding ENSG00000005020
37       chr7 26766501 26766578    78      - protein_coding ENSG00000005020
38       chr7 26766501 26766625   125      - protein_coding ENSG00000005020
39       chr7 26766501 26766625   125      - protein_coding ENSG00000005020
40       chr7 26766508 26766625   118      - protein_coding ENSG00000005020
41       chr7 26766579 26766625    47      -           utr5 ENSG00000005020
42       chr7 26778414 26778497    84      - protein_coding ENSG00000005020
43       chr7 26778414 26778497    84      - protein_coding ENSG00000005020
44       chr7 26778414 26778497    84      - protein_coding ENSG00000005020
45       chr7 26778414 26778497    84      - protein_coding ENSG00000005020
46       chr7 26778414 26778497    84      -           utr5 ENSG00000005020
47       chr7 26778459 26778497    39      - protein_coding ENSG00000005020
48       chr7 26779506 26779583    78      - protein_coding ENSG00000005020
49       chr7 26779506 26779583    78      - protein_coding ENSG00000005020
50       chr7 26779506 26779583    78      - protein_coding ENSG00000005020
51       chr7 26779506 26779583    78      - protein_coding ENSG00000005020
52       chr7 26779506 26779583    78      - protein_coding ENSG00000005020
53       chr7 26779506 26779583    78      - protein_coding ENSG00000005020
54       chr7 26779506 26779583    78      -           utr5 ENSG00000005020
55       chr7 26781961 26782094   134      - protein_coding ENSG00000005020
56       chr7 26786186 26786305   120      - protein_coding ENSG00000005020
57       chr7 26883648 26883756   109      - protein_coding ENSG00000005020
58       chr7 26883649 26883756   108      - protein_coding ENSG00000005020
59       chr7 26883649 26883756   108      - protein_coding ENSG00000005020
60       chr7 26883649 26883756   108      - protein_coding ENSG00000005020
61       chr7 26883649 26883756   108      - protein_coding ENSG00000005020
62       chr7 26883649 26883756   108      - protein_coding ENSG00000005020
63       chr7 26883649 26883756   108      - protein_coding ENSG00000005020
64       chr7 26883649 26883756   108      - protein_coding ENSG00000005020
65       chr7 26883649 26883756   108      -           utr5 ENSG00000005020
66       chr7 26887615 26887687    73      - protein_coding ENSG00000005020
67       chr7 26893756 26893781    26      - protein_coding ENSG00000005020
68       chr7 26893756 26893781    26      - protein_coding ENSG00000005020
69       chr7 26893756 26893781    26      - protein_coding ENSG00000005020
70       chr7 26893756 26893781    26      - protein_coding ENSG00000005020
71       chr7 26893756 26893781    26      - protein_coding ENSG00000005020
72       chr7 26893756 26893781    26      - protein_coding ENSG00000005020
73       chr7 26893756 26893781    26      - protein_coding ENSG00000005020
74       chr7 26893756 26893781    26      - protein_coding ENSG00000005020
75       chr7 26893756 26893781    26      -           utr5 ENSG00000005020
76       chr7 26894404 26894509   106      - protein_coding ENSG00000005020
77       chr7 26894404 26894509   106      - protein_coding ENSG00000005020
78       chr7 26894404 26894509   106      - protein_coding ENSG00000005020
79       chr7 26894404 26894509   106      - protein_coding ENSG00000005020
80       chr7 26894404 26894509   106      - protein_coding ENSG00000005020
81       chr7 26894404 26894509   106      - protein_coding ENSG00000005020
82       chr7 26894404 26894509   106      - protein_coding ENSG00000005020
83       chr7 26894404 26894509   106      - protein_coding ENSG00000005020
84       chr7 26894404 26894509   106      -           utr5 ENSG00000005020
85       chr7 26894737 26894770    34      - protein_coding ENSG00000005020
86       chr7 26897070 26897239   170      -           utr5 ENSG00000005020
87       chr7 26897070 26897297   228      - protein_coding ENSG00000005020
88       chr7 26897539 26897778   240      - protein_coding ENSG00000005020
89       chr7 26903982 26904048    67      - protein_coding ENSG00000005020
90       chr7 26903982 26904204   223      - protein_coding ENSG00000005020
91       chr7 26903982 26904206   225      - protein_coding ENSG00000005020
92       chr7 26904049 26904362   314      -           utr5 ENSG00000005020
93       chr7 26904461 26904732   272      - protein_coding ENSG00000005020
94       chr7 26933458 26933479    22      - protein_coding ENSG00000005020
95       chr7 26933480 26933592   113      -           utr5 ENSG00000005020
96       chr7 27022387 27022986   600      -     pseudogene ENSG00000226059
97       chr7 27034774 27034858    85      -           utr5 ENSG00000005020
              exon      transcript     symbol
1  ENSE00001693369 ENST00000420912 AC004947.2
2  ENSE00001596777 ENST00000457000 AC004947.2
3  ENSE00001601658 ENST00000430426 AC004947.2
4  ENSE00001792454 ENST00000457000 AC004947.2
5  ENSE00001618328 ENST00000420912 AC004947.2
6  ENSE00001716169 ENST00000457000 AC004947.2
7  ENSE00001608876 ENST00000430426 AC004947.2
8  ENSE00001587749 ENST00000409974    C7orf71
9  ENSE00001587737 ENST00000409974    C7orf71
10 ENSE00001587737 ENST00000409974    C7orf71
11 ENSE00001576853 ENST00000409974    C7orf71
12 ENSE00001589914 ENST00000409974    C7orf71
13 ENSE00001589914 ENST00000409974    C7orf71
14 ENSE00001592282 ENST00000441433   RPL7AP38
15 ENSE00000832068 ENST00000242109   KIAA0087
16 ENSE00000832068 ENST00000242109   KIAA0087
17 ENSE00000832069 ENST00000242109   KIAA0087
18 ENSE00000832069 ENST00000242109   KIAA0087
19 ENSE00001398234 ENST00000345317      SKAP2
20 ENSE00002309019 ENST00000539623      SKAP2
21 ENSE00001365768 ENST00000345317      SKAP2
22 ENSE00001365768 ENST00000539623      SKAP2
23 ENSE00001365768 ENST00000345317      SKAP2
24 ENSE00001365768 ENST00000539623      SKAP2
25 ENSE00001085057 ENST00000345317      SKAP2
26 ENSE00001085057 ENST00000539623      SKAP2
27 ENSE00001085059 ENST00000345317      SKAP2
28 ENSE00001085059 ENST00000539623      SKAP2
29 ENSE00001818255 ENST00000489977      SKAP2
30 ENSE00002834804 ENST00000345317      SKAP2
31 ENSE00002918324 ENST00000489977      SKAP2
32 ENSE00002834804 ENST00000539623      SKAP2
33 ENSE00002968721 ENST00000345317      SKAP2
34 ENSE00002860637 ENST00000489977      SKAP2
35 ENSE00002968721 ENST00000539623      SKAP2
36 ENSE00001857596 ENST00000468712      SKAP2
37 ENSE00002870168 ENST00000539623      SKAP2
38 ENSE00002885305 ENST00000345317      SKAP2
39 ENSE00002895759 ENST00000489977      SKAP2
40 ENSE00001864376 ENST00000495802      SKAP2
41 ENSE00002870168 ENST00000539623      SKAP2
42 ENSE00002893679 ENST00000345317      SKAP2
43 ENSE00002858677 ENST00000489977      SKAP2
44 ENSE00002858677 ENST00000468712      SKAP2
45 ENSE00002858677 ENST00000495802      SKAP2
46 ENSE00002858677 ENST00000539623      SKAP2
47 ENSE00001718920 ENST00000432747      SKAP2
48 ENSE00002784468 ENST00000345317      SKAP2
49 ENSE00002926563 ENST00000489977      SKAP2
50 ENSE00002926563 ENST00000468712      SKAP2
51 ENSE00002926563 ENST00000495802      SKAP2
52 ENSE00002784468 ENST00000432747      SKAP2
53 ENSE00002926563 ENST00000490456      SKAP2
54 ENSE00002926563 ENST00000539623      SKAP2
55 ENSE00001813208 ENST00000489977      SKAP2
56 ENSE00001900804 ENST00000497511      SKAP2
57 ENSE00001949942 ENST00000481204      SKAP2
58 ENSE00002782078 ENST00000345317      SKAP2
59 ENSE00002766115 ENST00000468712      SKAP2
60 ENSE00002766115 ENST00000495802      SKAP2
61 ENSE00002782078 ENST00000432747      SKAP2
62 ENSE00002766115 ENST00000490456      SKAP2
63 ENSE00002766115 ENST00000497511      SKAP2
64 ENSE00002766115 ENST00000487720      SKAP2
65 ENSE00002766115 ENST00000539623      SKAP2
66 ENSE00001873190 ENST00000487720      SKAP2
67 ENSE00002945002 ENST00000345317      SKAP2
68 ENSE00002788238 ENST00000468712      SKAP2
69 ENSE00002788238 ENST00000495802      SKAP2
70 ENSE00002945002 ENST00000432747      SKAP2
71 ENSE00002788238 ENST00000490456      SKAP2
72 ENSE00002788238 ENST00000497511      SKAP2
73 ENSE00002788238 ENST00000481204      SKAP2
74 ENSE00002788238 ENST00000487720      SKAP2
75 ENSE00002788238 ENST00000539623      SKAP2
76 ENSE00002899828 ENST00000345317      SKAP2
77 ENSE00002888190 ENST00000468712      SKAP2
78 ENSE00002888190 ENST00000495802      SKAP2
79 ENSE00002899828 ENST00000432747      SKAP2
80 ENSE00002888190 ENST00000490456      SKAP2
81 ENSE00002888190 ENST00000497511      SKAP2
82 ENSE00002888190 ENST00000481204      SKAP2
83 ENSE00002888190 ENST00000487720      SKAP2
84 ENSE00002888190 ENST00000539623      SKAP2
85 ENSE00001889253 ENST00000495802      SKAP2
86 ENSE00002217811 ENST00000539623      SKAP2
87 ENSE00001900594 ENST00000468712      SKAP2
88 ENSE00001815407 ENST00000490456      SKAP2
89 ENSE00001220341 ENST00000345317      SKAP2
90 ENSE00001924916 ENST00000487720      SKAP2
91 ENSE00001874332 ENST00000497511      SKAP2
92 ENSE00001220341 ENST00000345317      SKAP2
93 ENSE00001818170 ENST00000481204      SKAP2
94 ENSE00001686825 ENST00000432747      SKAP2
95 ENSE00001686825 ENST00000432747      SKAP2
96 ENSE00001769159 ENST00000417997   HMGB3P20
97 ENSE00001609754 ENST00000432747      SKAP2

geneModels is a data frame object containing gene models for the human genome.

Adding gene models

grtrack <- GeneRegionTrack(geneModels, genome = gen,
                           chromosome = chr, name = "Gene Model")

plotTracks(list(itrack, gtrack, atrack, grtrack))

Setting coordinates for plotting

plotTracks supports the from and to arguments that let us choose an arbitrary genomic range to plot.

plotTracks(list(itrack, gtrack, atrack, grtrack),
           from = 26700000, to = 26750000)

Zooming to the base pair level

When zooming further we may take a look at the actual genomic sequence at a given position

library(BSgenome.Hsapiens.UCSC.hg19)
strack <- SequenceTrack(BSgenome.Hsapiens.UCSC.hg19,
                   chromosome = chr)
plotTracks(list(itrack, gtrack, atrack, grtrack, strack),
           from = 26591822, to = 26591852, cex = 0.8)

Adding a quantitative track

We can add a quantitative track to the plot using the DataTrack() function. The data argument specifies the data to plot, and the type argument specifies the type of plot (e.g., “histogram”, “density”, “heatmap”).

# making example data
set.seed(123)
lim <- c(26700000, 26750000)
coords <- seq(lim[1], lim[2], length.out = 101) |>
  round()
dat <- runif(100, min = -10, max = 10)
dtrack <- DataTrack(data = dat, start = coords[-length(coords)],
                    end = coords[-1], chromosome = chr, genome = gen,
                    name = "Uniform")
plotTracks(list(itrack, gtrack, atrack, grtrack, dtrack),
           from = lim[1], to = lim[2])

Changing type of quantitative track

dtrack <- DataTrack(data = dat, start = coords[-length(coords)],
                    end = coords[-1], chromosome = chr, genome = gen,
                    name = "Uniform", type = "histogram")
plotTracks(list(itrack, gtrack, atrack, grtrack, dtrack),
           from = lim[1], to = lim[2])

Modifying plot parameters

TrackViewer - another package for genomic data visualization

Provide similar functionality to Gviz but with a different interface. It is designed to be more user-friendly and provides a more interactive experience.

Plot interaction data with trackViewer

Chromatin interaction data is often stored in a format called InteractionSet, which is a specialized data structure for representing genomic interactions. The trackViewer package provides functions to visualize these interactions. (TAD - topolocical associated domains)

GInteractions object with 6 interactions and 1 metadata column:
      seqnames1           ranges1     seqnames2           ranges2 |     score
          <Rle>         <IRanges>         <Rle>         <IRanges> | <numeric>
  [1]      chr6 51120000-51160000 ---      chr6 51120000-51160000 |   45.1227
  [2]      chr6 51120000-51160000 ---      chr6 51160000-51200000 |   35.0006
  [3]      chr6 51120000-51160000 ---      chr6 51200000-51240000 |   44.7322
  [4]      chr6 51120000-51160000 ---      chr6 51240000-51280000 |   29.3507
  [5]      chr6 51120000-51160000 ---      chr6 51280000-51320000 |   38.8417
  [6]      chr6 51120000-51160000 ---      chr6 51320000-51360000 |   31.7063
  -------
  regions: 53 ranges and 0 metadata columns
  seqinfo: 1 sequence from an unspecified genome; no seqlengths

Plotting interaction data

Define TAD

# Here we define a border_color to highlight some regions
gi$border_color <- NA ## highlight some regions
gi$border_color[sample(seq_along(gi), 20)] <- sample(1:7, 20, replace=TRUE)
## The TADs will be drawn as lines at points start(first), center point, end(second).
tads <- GInteractions(
  GRanges("chr6",
          IRanges(c(51130001, 51130001, 51450001, 52210001), width = 20000)),
  GRanges("chr6",
          IRanges(c(51530001, 52170001, 52210001, 53210001), width = 20000)))
range <- GRanges("chr6", IRanges(51120000, 53200000))
heatmap <- gi2track(gi)
# CTCF is a transcription factor that binds to DNA and is involved in chromatin organization
# the CTCF binding sites are often used as anchors for chromatin interactions
# ctcf data are base in ChIP-seq experiment
ctcf <- readRDS(system.file("extdata", "ctcf.sample.rds", package="trackViewer"))
ctcf
This is an object of track
 slot name:  
 slot type: data 
 slot format: BigWig 
slot dat:
GRanges object with 183 ranges and 1 metadata column:
        seqnames            ranges strand |     score
           <Rle>         <IRanges>  <Rle> | <numeric>
    [1]     chr6 51123519-51123819      * |         1
    [2]     chr6 51121987-51125687      * |         1
    [3]     chr6 51123514-51127314      * |         1
    [4]     chr6 51129218-51133018      * |         1
    [5]     chr6 51141148-51145048      * |         1
    ...      ...               ...    ... .       ...
  [179]     chr6 53151280-53153880      * |         1
  [180]     chr6 53152495-53152695      * |         1
  [181]     chr6 53160722-53162922      * |         1
  [182]     chr6 53160673-53165073      * |         1
  [183]     chr6 53195596-53198596      * |         1
  -------
  seqinfo: 22 sequences from an unspecified genome
slot dat2:
GRanges object with 0 ranges and 0 metadata columns:
   seqnames    ranges strand
      <Rle> <IRanges>  <Rle>
  -------
  seqinfo: no sequences
slot style: try object$style to see details.

Plotting interaction data

viewTracks(trackList(ctcf, heatmap, heightDist = c(1, 3)),
           gr=range, autoOptimizeStyle = TRUE)

Annotation of interaction data

## add TAD information
viewTracks(trackList(ctcf, heatmap, heightDist = c(1, 3)),
           gr=range, autoOptimizeStyle = TRUE)
## add TAD information
addInteractionAnnotation(tads, "heatmap", grid.lines, gp=gpar(col="#E69F00", lwd=3, lty=3))

Highlighting regions with high score

gi_sub <- gi[order(gi$score, decreasing = TRUE)]
gi_sub <- head(gi_sub[distance(first(gi_sub), second(gi_sub))>200000], n=5)
start(regions(gi_sub)) <- start(regions(gi_sub))-40000
end(regions(gi_sub)) <- end(regions(gi_sub))+40000
viewTracks(trackList(ctcf, heatmap, heightDist = c(1, 3)),
           gr=range, autoOptimizeStyle = TRUE)
## add TAD information
addInteractionAnnotation(tads, "heatmap", grid.lines, gp=gpar(col="#E69F00", lwd=3, lty=3))

addInteractionAnnotation(gi_sub, "heatmap", grid.polygon, gp=gpar(col="red", lwd=2, lty=2, fill=NA))